rm(list = ls())
gc(verbose = FALSE)

args = commandArgs(trailingOnly = "TRUE")
args

arg1 <- args[1]
arg2 <- args[2]

# Define library folder
setwd(arg1)
.libPaths(arg2)
.libPaths(.libPaths()[1])

pkg <- c("nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
         "knitr", "kableExtra", "magrittr", "kimisc", "data.table", "readstata13", "DescTools", "scales", "ggplot2")
lapply(pkg, library, character.only = T, lib.loc = arg2)


# read the data
ihwap_complete = read.csv(paste0(arg1, "/Data/ihwap_full.csv"))

# drop some variables not to be used
ihwap_full = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid))
ihwap_completeX = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid, total_mmbtu, treated))

# will train model only with untreated homes
ihwap_full = ihwap_full[which(ihwap_full$treated==0), ]

### Data Frames for the models
# dependent variable
train_y = as.matrix(subset(ihwap_full, select = c("total_mmbtu")))
# independent variables
train_x = subset(ihwap_full, select = -c(total_mmbtu, treated))

######## Setup SuperLearner
# Function to create a list of model names
expandingList <- function(capacity = 10) {
  buffer <- vector('list', capacity)
  length <- 0
  
  methods <- list()
  
  methods$double.size <- function() {
    buffer <<- c(buffer, vector('list', capacity))
    capacity <<- capacity * 2
  }
  
  methods$add <- function(val) {
    if(length == capacity) {
      methods$double.size()
    }
    
    length <<- length + 1
    buffer[[length]] <<- val
  }
  
  methods$as.list <- function() {
    b <- buffer[0:length]
    return(b)
  }
  
  methods
}
SL.library<-expandingList()


### different models to try
# xgboost
xgboost.tune <- list(ntrees = c(1000, 2000),
                     max_depth = c(20, 30),
                     shrinkage = c(0.05, 0.5),
                     minobspernode = c(30))

# Set detailed names = T so we can see the configuration for each function.
# Also shorten the name prefix.
xgboost <- create.Learner("SL.xgboost", tune = xgboost.tune, detailed_names = T, name_prefix = "xgboost")
for(i in 1:length(xgboost$names)){
  SL.library$add(c(xgboost$names[i]))
}


fit.gaselec.SL = SuperLearner(Y=train_y, X=train_x,
                                  SL.library=SL.library$as.list(), family=gaussian(),
                                  method="method.NNLS", verbose=TRUE, cvControl = list(V = 5))
saveRDS(fit.gaselec.SL, paste0(arg1, "/Results/Model_Outputs/predictpre_model_xgboost.rds"))

# predictions for pre-data
in_preds = as.data.frame(fit.gaselec.SL$library.predict)
cv_preds = as.data.frame(fit.gaselec.SL$Z)
colnames(cv_preds) =  sprintf('cv_%s', colnames(in_preds)) 
colnames(in_preds) =  sprintf('in_%s', colnames(in_preds))

data_export = subset(ihwap_complete[which(ihwap_complete$treated==0), ], select = c(Household, end_day, end_month, end_year))
data_export = as.data.frame(cbind(data_export, in_preds, cv_preds))

saveRDS(data_export, paste0(arg1, "/Results/Model_Outputs/predictpre_results_xgboost.rds"))
write.csv(data_export, paste0(arg1, "/Results/Model_Outputs/predictpre_results_xgboost.csv"), row.names = FALSE)


# predictions for post-data
predictions = predict(fit.gaselec.SL, ihwap_completeX, onlySL = T)
data_export2 = subset(ihwap_complete, select = c(Household, end_day, end_month, end_year))
data_export2$pred_xgboost = unlist(as.numeric(predictions$pred))

saveRDS(data_export2, paste0(arg1, "/Results/Model_Outputs/predictpost_results_xgboost.rds"))
write.csv(data_export2, paste0(arg1, "/Results/Model_Outputs/predictpost_results_xgboost.csv"), row.names = FALSE)
